library("tidyverse")
library("GGally")
library("corrplot")
library("reshape2")
library("plotly")
library(MASS)
library(ggplot2) # plot
library(calibrate) # textxy
library(car) #vif, durbin-watson test
library(orcutt) # To fix atuto-correlation issue
library(leaps) #regsubset
library(qpcR) #To calculate PRESS Residual and PRESS Statistic
library(factoextra)# PCA graphs
library(nortest) # for normality testing - Anderson-Darling etc
# Notebook figure options: height in inches, resolution in dpi
options(repr.plot.height=8, repr.plot.res=300)
# Load the Boston Housing data; path is relative to the notebook directory
boston_dataset <- read_csv("../datasets/BostonHousing.csv")
# Move the target variable MEDV to the first column for convenience
boston_dataset <- boston_dataset %>%
dplyr::select(MEDV, everything())
# First rows and column names as a quick sanity check
head(boston_dataset)
names(boston_dataset)
The following information comes from the official site: https://archive.ics.uci.edu/ml/machine-learning-databases/housing
There are 14 attributes in each case of the dataset. They are:
MEDV - Price of the houses is our target variable.
# Dataset dimensions (rows x columns)
dim(boston_dataset)
So our data has 506 observations of 14 columns.
# Structure: column names, types and sample values
str(boston_dataset)
We can see that all the features are already in numerical double formats.
# Min, max, quartiles, median and mean for every column
summary(boston_dataset)
In the above summary we can see the helpful statistics such as min, max, mean, median and quartiles for all of the columns.
# Count of missing values per column
colSums(is.na(boston_dataset))
Inference: There is no NULL or missing values in our data. We will further investigate upon this.
# Taller canvas for the large pairs plot
options(repr.plot.height=12)
# Pairwise scatterplots, density curves and correlations for all features
ggpairs(boston_dataset)
In the above ggpairs plot we can see the correlation, density curve and the scatterplot between pairs simultaneously. Below we will see more plots specifically focused on the correlation and density curves, and then derive inferences.
Inference:
options(repr.plot.height=8)
# Mixed correlation plot: coefficients in the lower triangle, circles above
corrplot.mixed(cor(boston_dataset), lower="number", upper="circle",tl.pos = "lt")
Inference: From the above data we can see the correlation between features. We can infer:
Because MEDV is our target variable and we see two features highly correlated with it, we are interested in the joint density plots for them. They are given below:
options(repr.plot.height=5, repr.plot.res=200)
# devtools::install_github("WinVector/WVPlots")
suppressMessages(library(WVPlots))
# Joint density (scatter plus marginal distributions) of MEDV against the
# two features most correlated with it: LSTAT and RM
suppressWarnings( ScatterHist( boston_dataset, "MEDV", "LSTAT", title="Joint Density plot of MEDV - LSTAT"))
suppressWarnings( ScatterHist( boston_dataset, "MEDV", "RM", title="Joint Density plot of MEDV - RM"))
Inference:
library(rgl)
# Plot
# Interactive 3D scatterplot of the two strongest predictors against MEDV
plot3d(
x=boston_dataset$LSTAT, y=boston_dataset$RM, z=boston_dataset$MEDV,
# col = data$CHAS,
type = 's',
radius = .1,
xlab="LSTAT", ylab="RM", zlab="MEDV")
# To display in an R Markdown document:
rglwidget()
# To save to a file:
htmlwidgets::saveWidget(rglwidget(width = 800, height = 800),
file = "3D-Scatterplot-MEDV-LSTAT-RM.html",
libdir = "libs",
selfcontained = FALSE
)
Inference:
#setting the resolution of the plots
par(mfrow=c(1,3))
options(repr.plot.height=3,repr.plot.width=15, repr.plot.res=150)
# powerPlot: draw the density curve, histogram and boxplot of one numeric
# feature in a single panel.
#   x    - numeric vector to plot
#   name - label for the x axis
# Called for its plotting side effect; returns nothing useful.
powerPlot <- function(x, name)
{
dens <- density(x)
# Histogram on the probability scale so the density curve overlays correctly
hist(x, probability = TRUE, xlab = name, ylab="", ylim= c(0, max(dens$y)), col = "grey",
axes = FALSE, main = "", cex.lab=2)
# Axis
axis(1, cex.axis=2)
# Density
lines(dens, col = "red", lwd = 1.5)
# Overlay a horizontal boxplot on the same panel
par(new = TRUE)
# FIX: base boxplot() colours outlier points via `outcol`; the original
# `outlier.colour` is a ggplot2 argument and was silently ignored here.
boxplot(x, horizontal = TRUE, axes = FALSE,
lwd = 2, col = rgb(0, 1, 1, alpha = 0.15), outcol = "red")
}
# a loop to run powerPlot() for all of the features
for (x in names(boston_dataset)){
powerPlot( boston_dataset[[x]], x)
}
Inference:
options(repr.plot.height=12, repr.plot.res=200)
# Scatter of each feature against MEDV with a smoothed trend line, one
# facet per feature (data reshaped to long format via melt first)
rel <- boston_dataset %>%
melt(id.vars = "MEDV") %>%
ggplot(aes(x = value, y = MEDV, colour = variable), na.rm=TRUE) +
geom_point(alpha = 0.7, na.rm=TRUE) +
stat_smooth(aes(colour = "black"), na.rm=TRUE) +
facet_wrap(~variable, scales = "free", ncol = 2) +
labs(x = "Variable Value", y = "Median House Price ($1000s)") +
theme_minimal( base_size=18) +
theme( legend.text = element_text( size=14))
suppressMessages(suppressWarnings( print(rel)))
Inference:
options(repr.plot.height=5, repr.plot.res=200)
# Density curve of the target variable MEDV
boston_dataset %>%
ggplot(aes(MEDV)) +
stat_density(fill="white", col="blue") +
theme_minimal( base_size=18) +
ggtitle("MEDV-Price of houses Density curve") + theme( plot.title = element_text( hjust=0.5))
Inference:
# Count observations with at least one feature outside the 3-sigma band.
# Column means and sds are computed once up front; the original code
# recomputed them for every row (O(n^2 * p) work) and grew vectors with c().
mu <- vapply(boston_dataset, mean, numeric(1))
sigma <- vapply(boston_dataset, sd, numeric(1))
# Logical matrix: TRUE where a value lies outside mean +/- 3*sd, per column
extreme <- mapply(function(col, m, s) col < m - 3 * s | col > m + 3 * s,
boston_dataset, mu, sigma)
# A row is an outlier if any of its features is extreme
outliers <- unname(which(rowSums(extreme) > 0))
cat("No. of Outliers:",length(outliers))
## backward elimination
# Best-subset search (backward direction) over all predictors of MEDV
BER=regsubsets(MEDV~.,data=boston_dataset,method="backward")
# Selection table: which variables enter at each subset size, plus R2,
# residual SS, adjusted R2, Mallows' Cp and BIC
Modelsummary=cbind(summary(BER)$which,R2=summary(BER)$rsq,SSres=summary(BER)$rss,AdjR2=summary(BER)$adjr2,Cp=summary(BER)$cp,BIC=summary(BER)$bic)
Modelsummary
Significant Features : CRIM , NOX , RM , DIS , RAD , PTRATIO , B , LSTAT
## forward selection
# Best-subset search (forward direction) over all predictors of MEDV
FSR=regsubsets(MEDV~.,data=boston_dataset,method="forward")
# summary(FSR)
# Same selection metrics as the backward run, for comparison
Modelsummary=cbind(summary(FSR)$which,R2=summary(FSR)$rsq,SSres=summary(FSR)$rss,AdjR2=summary(FSR)$adjr2,Cp=summary(FSR)$cp,BIC=summary(FSR)$bic)
Modelsummary
Significant Features : ZN , CHAS , NOX , RM , DIS , PTRATIO , B , LSTAT
#Stepwise Regression
# Sequential-replacement search, a compromise between forward and backward
SWR=regsubsets(MEDV~.,data=boston_dataset,method="seqrep")
# Same selection metrics as the other two searches, for comparison
Modelsummary=cbind(summary(SWR)$which,R2=summary(SWR)$rsq,SSres=summary(SWR)$rss,AdjR2=summary(SWR)$adjr2,Cp=summary(SWR)$cp,BIC=summary(SWR)$bic)
Modelsummary
Significant Features : ZN , CHAS , NOX , RM , DIS , PTRATIO , B , LSTAT
We will create a model taking the intersection of features suggested by the above three methods.
The glance of data of the suggested features is given below.
# Keep only MEDV plus the features agreed on by the three selection methods
boston_dataset <- boston_dataset %>% dplyr::select(c(MEDV,ZN,CHAS,NOX,RM,DIS,PTRATIO,B,LSTAT))
head(boston_dataset)
# Fit the multiple linear regression of MEDV on the selected features.
# BUG FIX: the original formula `boston_dataset$MEDV ~ .` does not name MEDV
# on the left-hand side, so the `.` expansion includes MEDV itself as a
# predictor and the fit degenerates. `MEDV ~ .` correctly excludes the response.
modelm <- lm(MEDV ~ ., data = boston_dataset)
summary(modelm)
# RMSE for checking the accuracy of model
# (sqrt of SSE / residual degrees of freedom, i.e. the residual std. error)
cat("RMSE:", sqrt(sum(modelm$residuals^2)/modelm$df))
Model-1
RMSE = 4.84
Adjusted R-Squared = 0.722
# Variance inflation factors; values well above 10 signal multicollinearity
vif(modelm)
Inference:
## Checking for Influence Points
# Standard influence diagnostics (DFBETAS, DFFITS, Cook's D, hat values)
inflm.boston <- influence.measures(modelm)
# Row indices flagged as influential by at least one diagnostic
inf_pts<-which(apply(inflm.boston$is.inf, 1, any))
# which observations 'are' influential
dim(summary(inflm.boston)) # only these
Inference:
Since this can affect our model adversely, we check which influence points lie outside the 2-sigma limits and remove those points.
#boston_dataset<-boston_dataset[-inf_pts,]
# Among the influential points, count those that also lie outside the
# 2-sigma band on at least one feature. Column means/sds are computed once
# up front; the original recomputed them inside the double loop.
mu <- vapply(boston_dataset, mean, numeric(1))
sigma <- vapply(boston_dataset, sd, numeric(1))
outliers <- c()
for(i in inf_pts){
x <- boston_dataset[i,]
# TRUE for each feature of row i that is beyond mean +/- 2*sd
extreme <- vapply(names(boston_dataset), function(col) {
x[[col]] < mu[[col]] - 2*sigma[[col]] || x[[col]] > mu[[col]] + 2*sigma[[col]]
}, logical(1))
if(any(extreme)){
outliers <- c(outliers, i)
}
}
length(outliers)
Inference:
Removing all 54 points resulted in a better model.
# removing outlier-influence-points
boston_dataset<-boston_dataset[-inf_pts,]
dim(boston_dataset)
names(boston_dataset)
# Refit the model on the cleaned data.
# BUG FIX: use `MEDV ~ .` so the `.` expansion does not pull MEDV in as its
# own predictor (the original `boston_dataset$MEDV ~ .` formula did).
modelm <- lm(MEDV ~ ., data = boston_dataset)
summary(modelm)
# Residual standard error reported as RMSE, as in the earlier model
cat("RMSE:", sqrt(sum(modelm$residuals^2)/modelm$df))
Model-1.2
RMSE = 3.59
Adjusted R-Squared = 0.81
Thus we got considerable improvement in RMSE from 4.84 to 3.59.
# Re-check multicollinearity after removing the influential outliers
vif(modelm)
Inference: There is no multicollinearity.
par(mfrow=c(1,3))
options(repr.plot.height=3,repr.plot.width=15, repr.plot.res=150)
# Redefinition of powerPlot (identical to the earlier one) so this section
# can be re-run standalone: density curve, histogram and boxplot in one panel.
#   x    - numeric vector to plot
#   name - label for the x axis
powerPlot <- function(x, name) {
dens <- density(x)
# Histogram on the probability scale so the density curve overlays correctly
hist(x, probability = TRUE, xlab = name, ylab="", ylim= c(0, max(dens$y)), col = "grey",
axes = FALSE, main = "", cex.lab=2)
# Axis
axis(1, cex.axis=2)
# Density
lines(dens, col = "red", lwd = 1.5)
# Overlay a horizontal boxplot on the same panel
par(new = TRUE)
# FIX: base boxplot() colours outlier points via `outcol`; the original
# `outlier.colour` is a ggplot2 argument and was silently ignored here.
boxplot(x, horizontal = TRUE, axes = FALSE,
lwd = 2, col = rgb(0, 1, 1, alpha = 0.15), outcol = "red")
}
# a loop to run powerPlot() for all of the features
for (x in names(boston_dataset)){
powerPlot( boston_dataset[[x]], x)
}
Inference:
# ??qqnorm  # leftover interactive help search; commented out for script runs
# Test-1. QQ Plot
options(repr.plot.height=8, repr.plot.res=300)
# Standardised residuals of the fitted model
ors <- rstandard(modelm)
qqnorm(ors, pch = 1, frame = FALSE, cex.lab=1.5, cex.axis=1.5, main="Normal QQ Plot of Residuals", cex.main=2)
qqline(ors, col = "steelblue", lwd = 2)
# Test-2: Shapiro-Wilk normality test
shapiro.test(rstandard(modelm))
# Test-3: Anderson-Darling normality test (nortest package)
ad.test(rstandard(modelm))
Inference: From the above tests we can infer following:
We attribute this result to the fact that the target variable in the data wasn't normally distributed.
## linear relation between residuals and X's
# Studentised residuals against each predictor; a patternless cloud
# supports the linearity assumption
par(mfrow=c(3,3))
for (x in names(boston_dataset)){
plot(boston_dataset[[x]],rstudent(modelm),xlab = x,ylab = "rst(ti)", cex.lab=2, cex.axis=2)}
Inference:
# GETTING Principal Components
# Drop the target (first column) before PCA
boston_x <- boston_dataset[,c(-1)]
#Scale the data
# NOTE(review): center = FALSE with scale = TRUE divides by the root mean
# square rather than the standard deviation — confirm this is intended
# (the conventional choice is center = TRUE, scale. = TRUE).
res.pca <- prcomp(boston_x, retx=TRUE,center = FALSE, scale = TRUE)
summary(res.pca)
Eigen values or equivalently variance of Principal Components: (The same information is available in the above summary)
eigval<- get_eigenvalue(res.pca)
eigval
options( repr.plot.height=5)
# Scree plot: percentage of variance explained by each component
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0,100), ggtheme = theme_minimal(base_size=18)+ theme( plot.title = element_text( hjust=0.5)))
Inference:
Below we will look further into PC-1 and PC-2.
library(factoextra)
options(repr.plot.height=5, repr.plot.res=300)
# Contributions of variables to PCs
# Bar charts of each variable's percentage contribution to PC1 and PC2
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10, ggtheme = theme_minimal(base_size=18)+ theme( plot.title = element_text( hjust=0.5)))
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10, ggtheme = theme_minimal(base_size=18)+ theme( plot.title = element_text( hjust=0.5)))
Inference:
The below chart shows the same thing.
# Variable correlation circle for the first two principal components
fviz_pca_var(res.pca, col.var = "blue", ggtheme = theme_minimal(base_size=18))
##model with PCA1 suggested features data.
model_pca2 = lm(MEDV~RM+PTRATIO+DIS+LSTAT,data=boston_dataset)
summary(model_pca2)
cat("Adjusted R squared:",summary(model_pca2)$adj.r.squared,"\n")
# Residual standard error reported as RMSE, as for the earlier models
cat("RMSE:", sqrt(sum(model_pca2$residuals^2)/model_pca2$df))
Model-2
RMSE = 3.84
Adjusted R-squared = 0.79
The lm() model is built based upon the suggested features of all of the previous methods. The influental outliers also have been removed in previous section.
# Final linear model: selected features, influential outliers removed
lm_model = lm(MEDV~., data=boston_dataset)
summary(lm_model)
cat("Adjusted R squared:",summary(lm_model)$adj.r.squared,"\n")
# Residual standard error reported as RMSE
cat("RMSE:", sqrt(sum(lm_model$residuals^2)/lm_model$df))
Final Model
RMSE = 3.59
Adjusted R-Squared = 0.81
Above is the best value we got for RMSE and Adjusted R-squared among all of our models.
options( repr.plot.height=8)
# In-sample predictions for a predicted-vs-actual diagnostic plot
pred<- predict(lm_model, boston_dataset)
act_vs_pred<- data.frame(actual= boston_dataset$MEDV, predicted= pred)
# Points on the 45-degree line correspond to perfect predictions
ggplot(act_vs_pred, aes(x= predicted, y=actual))+
geom_point()+
geom_abline(intercept=0, slope=1)+
theme_minimal(base_size=18) + theme( plot.title = element_text( hjust=0.5)) +
labs(x= 'Predicted Values', y= "Actual Values", title= "Predicted vs Actual Values")
Inference:
options(repr.plot.res=300, repr.plot.height=8)
# Histogram of the final model's residuals; roughly symmetric around zero
# is what we hope to see
res <- as.data.frame(residuals(lm_model))
ggplot(res,aes(residuals(lm_model)))+
geom_histogram(fill='brown',alpha=0.5, binwidth=1) +
theme_minimal( base_size=18) + theme( plot.title = element_text( hjust=0.5)) + labs(title="Distribution of Residuals")
Inference:
options(repr.plot.height=12,repr.plot.res=300)
# Standard lm diagnostic plots: residuals vs fitted, normal QQ,
# scale-location, residuals vs leverage
par(mfrow = c(2,2))
plot(lm_model, cex.lab=1.5, cex.axis=1.5)
suppressMessages(library(caret))
# Reproducible 90/10 train/test split balanced on the distribution of MEDV
set.seed(998)
trainPartitionRows <- createDataPartition(boston_dataset$MEDV, p = .90, list = FALSE)
cat("Total no of observations = ", nrow(boston_dataset),"\n")
cat("# for training = ", nrow(trainPartitionRows),"\n")
cat("# for testing = ", nrow(boston_dataset) - nrow(trainPartitionRows),"\n")
trainDataset <- boston_dataset[ trainPartitionRows,]
testDataset <- boston_dataset[-trainPartitionRows,]
cat("\nHead of TRAIN DATASET:")
head(trainDataset)
cat("\nHead of TEST DATASET:")
head(testDataset)
fitControl <- trainControl(## 10-fold CV
method = "cv", # repeatedcv
number = 10
## repeated ten times
#repeats = 10,
)
# Linear model trained with 10-fold cross-validation on the training set
CVModel <- train(MEDV ~ .,
data = trainDataset,
method = "lm",
trControl = fitControl,
## This last option is actually one
## for gbm() that passes through
#verbose = TRUE
)
CVModel
summary(CVModel)
CV Model
RMSE = 3.55
Adjusted R-squared = 0.82
Mean Absolute Error (MAE) = 2.69
#validation on training data
pred<- predict(CVModel, trainDataset)
validation<- data.frame(actual= trainDataset$MEDV, predicted= pred)
# Predicted vs actual on the training partition; points on the 45-degree
# line are perfect predictions
ggplot(validation, aes(x= predicted, y=actual))+
geom_point()+
geom_abline(intercept=0, slope=1)+
theme_minimal(base_size=18) + theme( plot.title = element_text( hjust=0.5)) +
labs(x= 'Predicted Values', y= "Actual Values", title= "Predicted vs Actual Values")
Inference:
# Squared prediction errors. Renamed from `residuals`, which shadowed the
# stats::residuals generic and misleadingly held squared values.
sq_errors <- (validation$actual - validation$predicted)^2
RMSE <- sqrt(mean(sq_errors))
summary(pred)
cat("RMSE on train data:",RMSE)
#TESTING
pred1<- predict(CVModel, newdata= testDataset)
test<- data.frame(actual= testDataset$MEDV, predicted= pred1)
# Predicted vs actual on the held-out test partition
ggplot(test, aes(x= predicted, y=actual))+
geom_point()+
geom_abline(intercept=0, slope=1)+
theme_minimal(base_size=18) + theme( plot.title = element_text( hjust=0.5)) +
labs(x= 'Predicted Values', y= "Actual Values", title= "Predicted vs Actual Values")
summary(pred1)
# Squared prediction errors (renamed from `residuals1` for clarity —
# these are squared residuals, not raw residuals)
sq_errors1 <- (test$actual - test$predicted)^2
RMSE1 <- sqrt(mean(sq_errors1))
cat("RMSE on test data:",RMSE1)
Inference:
RMSE on test data: 3.94630
RMSE on train data: 3.513884
n=nrow(trainDataset)
# create empty data frame
# (rows 1..136 are never filled by the loop below and keep their zeros)
learnCurve <- data.frame(m=double(n),
trainRMSE=double(n),
cvRMSE = double(n))
# test data response feature
testY <- testDataset$MEDV
# Run algorithms using 10-fold cross validation with 3 repeats
trainControl <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "RMSE"
# loop over training examples
# NOTE(review): the start index 137 is hard-coded — presumably to skip
# training-set sizes too small for a stable fit; confirm this choice
for (i in 137:nrow(trainDataset)) {
learnCurve$m[i] <- i
# train learning algorithm with size i
fit.lm <- train(MEDV~.,
data=trainDataset[1:i,],
method="lm",
metric="RMSE",
#preProc=c("center", "scale"),
trControl=trainControl)
learnCurve$trainRMSE[i] <- fit.lm$results$RMSE
# use trained parameters to predict on test data
prediction <- predict(fit.lm, newdata = testDataset[,-1])
rmse <- postResample(prediction, testY)
learnCurve$cvRMSE[i] <- rmse[1]
}
# Plot log RMSE against training-set size. The unfilled rows are zero, so
# log() yields -Inf for them; the xlim below keeps those points off-screen.
plot(log(learnCurve$trainRMSE),type = "o",col = "red",xlab = "Training set size",
ylab = "Error (RMSE)", main = "Linear Model Learning Curve", ylim=c(1,1.50), xlim=c(136,410), cex.axis=1.5, cex.lab=1.5)
lines(log(learnCurve$cvRMSE), type = "o", col = "blue")
legend('topright', c("Train error", "Test error"), lty = c(1,1), lwd = c(2.5, 2.5),
col = c("red", "blue"))
Since the best-performing model is the CV model, we take it as our final model.
# The CV model performed best, so persist it as the final model
final_model <- CVModel
# Serialise the fitted model for later reuse (e.g. in a deployment script)
saveRDS(final_model, "./model.rds")
summary(final_model)
Final Model(CV Model)
RMSE = 3.55
Adjusted R-squared = 0.82
Mean Absolute Error (MAE) = 2.69